Source code for nlp_architect.utils.embedding

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import os
import sys
from typing import List

import numpy as np

from gensim.models import FastText

from nlp_architect.utils.text import Vocabulary

logger = logging.getLogger(__name__)


def load_word_embeddings(file_path, vocab=None):
    """
    Loads a word embedding model text file into a word(str) to numpy vector dictionary

    Every usable line holds a token followed by whitespace-separated vector
    components. Lines with fewer than 5 whitespace-separated fields are
    skipped, which drops blank lines and short header lines.

    Args:
        file_path (str): path to model file
        vocab (list of str): optional - vocabulary; when given, only words
            contained in it are loaded (the single-space token is always
            loaded, regardless of the vocabulary)

    Returns:
        dict: a dictionary mapping each word to its float32 numpy.ndarray vector
        int: detected word embedding vector size, taken from the first regular
        word that was loaded (None when no regular word was loaded)
    """
    # Membership is tested once per line of the file. With a list/tuple
    # vocabulary each test would be a linear scan, so hash it once up front.
    if isinstance(vocab, (list, tuple)):
        vocab = frozenset(vocab)

    word_vectors = {}
    size = None
    with open(file_path, encoding="utf-8") as fp:
        for line in fp:
            line_fields = line.split()
            if len(line_fields) < 5:
                continue
            if line[0] == " ":
                # The token itself is a space, which split() discards, so every
                # remaining field is a vector component.
                word_vectors[" "] = np.asarray(line_fields, dtype="float32")
                continue
            word = line_fields[0]
            if vocab is None or word in vocab:
                values = line_fields[1:]
                word_vectors[word] = np.asarray(values, dtype="float32")
                if size is None:
                    size = len(values)
    return word_vectors, size
def fill_embedding_mat(src_mat, src_lex, emb_lex, emb_size):
    """
    Creates a new matrix from given matrix of int words using the embedding
    model provided.

    Each entry of ``src_mat`` greater than 0 is looked up in ``src_lex``; the
    result is converted to a lower-cased string and looked up in ``emb_lex``.
    Entries that are not positive, or that have no vector in ``emb_lex``,
    are left as zero vectors.

    Args:
        src_mat (numpy.ndarray): source matrix (2D, word ids)
        src_lex (dict): source matrix lexicon (word id -> word)
        emb_lex (dict): embedding lexicon (word -> vector)
        emb_size (int): embedding vector size

    Returns:
        numpy.ndarray: array of shape
        ``(src_mat.shape[0], src_mat.shape[1], emb_size)``
    """
    n_rows, n_cols = src_mat.shape[0], src_mat.shape[1]
    emb_mat = np.zeros((n_rows, n_cols, emb_size))
    for row_idx, row in enumerate(src_mat):
        for col_idx, word_id in enumerate(row):
            if not word_id > 0:
                continue
            # str() is applied before lower(), so an id missing from src_lex
            # is looked up under the key "none".
            key = str(src_lex.get(word_id)).lower()
            vector = emb_lex.get(key)
            if vector is None:
                continue
            emb_mat[row_idx, col_idx] = vector
    return emb_mat
def get_embedding_matrix(
    embeddings: dict, vocab: Vocabulary, embedding_size: int = None
) -> np.ndarray:
    """
    Generate a matrix of word embeddings given a vocabulary

    Row ``wid`` of the result holds the vector found in ``embeddings`` for the
    lower-cased vocabulary word with id ``wid``; rows of words that have no
    vector stay zero.

    Args:
        embeddings (dict): a dictionary of embedding vectors
        vocab (Vocabulary): a Vocabulary
        embedding_size (int): custom number of matrix rows; when falsy the
            number of rows is ``len(vocab)``

    Returns:
        a 2D numpy matrix of lexicon embeddings
    """
    # The vector width is taken from the first entry of the dictionary.
    first_vector = next(iter(embeddings.values()))
    n_cols = len(first_vector)
    n_rows = embedding_size if embedding_size else len(vocab)
    mat = np.zeros((n_rows, n_cols))
    for token, token_id in vocab.vocab.items():
        vector = embeddings.get(token.lower())
        if vector is None:
            continue
        mat[token_id] = vector
    return mat
def load_embedding_file(filename: str) -> dict:
    """Load a word embedding file

    Each non-blank line is expected to hold a word followed by
    whitespace-separated float components. Blank or whitespace-only lines
    are skipped.

    Args:
        filename (str): path to embedding file

    Returns:
        dict: dictionary with embedding vectors (word -> numpy.ndarray)

    Raises:
        OSError: propagated from ``open`` when the file cannot be opened
        ValueError: when a vector component cannot be parsed as a float
    """
    if filename is not None and os.path.exists(filename):
        logger.info("Loading external word embeddings from %s", filename)
    embedding_dict = {}
    # The existence check above only guards the log message; an unusable path
    # still surfaces as the exception raised by open().
    with open(filename, encoding="utf-8") as fp:
        for line in fp:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. a trailing newline at end of file): there is
                # no word to index, so skip instead of raising IndexError.
                continue
            word, *values = split_line
            embedding_dict[word] = np.array([float(val) for val in values])
    return embedding_dict
# pylint: disable=not-context-manager
class ELMoEmbedderTFHUB(object):
    """ELMo embedder backed by the TensorFlow Hub ELMo module.

    On construction, builds a dedicated TensorFlow graph holding the hub
    module loaded from ``https://tfhub.dev/google/elmo/3`` and opens a
    session on that graph that is kept for the lifetime of the object.
    """

    def __init__(self):
        # TensorFlow and TensorFlow Hub are imported lazily so the rest of the
        # module can be used without them installed.
        try:
            import tensorflow as tf
            import tensorflow_hub as hub
        except (AttributeError, ImportError):
            logger.error(
                "tensorflow_hub is not installed, "
                + "please install nlp_architect with [all] package. "
                + "for example: pip install nlp_architect[all]"
            )
            # NOTE: terminates the interpreter rather than raising to the caller.
            sys.exit()
        # Private graph so the module's ops do not land in the default graph.
        self.g = tf.Graph()
        with self.g.as_default():
            # Placeholders fed on every get_vector() call.
            text_input = tf.compat.v1.placeholder(dtype=tf.string)
            text_input_size = tf.compat.v1.placeholder(dtype=tf.int32)
            print(
                "Loading Tensorflow hub ELMo model, "
                "might take a while on first load (model is downloaded from web)"
            )
            self.elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=False)
            self.inputs = {"tokens": text_input, "sequence_len": text_input_size}
            # Only the "elmo" entry of the module's output dictionary is used.
            self.embedding = self.elmo(inputs=self.inputs, signature="tokens", as_dict=True)["elmo"]
            # Session bound to the private graph; variables and lookup tables
            # are initialized once here.
            sess = tf.compat.v1.Session(graph=self.g)
            sess.run(tf.compat.v1.global_variables_initializer())
            sess.run(tf.compat.v1.tables_initializer())
            self.s = sess

    def get_vector(self, tokens):
        """Run the ELMo module on one tokenized sentence.

        The sentence is fed as a batch of size one (``[tokens]`` together with
        ``[len(tokens)]``) and the batch axis is squeezed from the result.

        Args:
            tokens: sequence of tokens of a single sentence
                (presumably list of str - the placeholder dtype is tf.string)

        Returns:
            numpy.ndarray: the module output with axis 0 removed
            (presumably one vector per token - TODO confirm shape against
            the hub module documentation)
        """
        vec = self.s.run(
            self.embedding,
            feed_dict={self.inputs["tokens"]: [tokens], self.inputs["sequence_len"]: [len(tokens)]},
        )
        return np.squeeze(vec, axis=0)
class FasttextEmbeddingsModel(object):
    """Fasttext embedding trainer class

    Thin wrapper around a gensim ``FastText`` model, stored in ``self.model``.

    Args:
        size (int, optional): embedding size, forwarded as ``size``
        window (int, optional): The maximum distance between the current and
            predicted word within a sentence, forwarded as ``window``
        min_count (int, optional): forwarded as ``min_count``
        skipgram (bool, optional): forwarded as the ``sg`` argument
    """

    def __init__(self, size: int = 5, window: int = 3, min_count: int = 1, skipgram: bool = True):
        self.model = FastText(size=size, window=window, min_count=min_count, sg=skipgram)

    def train(self, texts: List[List[str]], epochs: int = 100):
        """build the vocabulary from the given texts and train on them

        Args:
            texts (List[List[str]]): list of tokenized sentences
            epochs (int, optional): number of epochs to train
        """
        fasttext = self.model
        fasttext.build_vocab(texts)
        fasttext.train(sentences=texts, total_examples=len(texts), epochs=epochs)

    def vec(self, word: str) -> np.ndarray:
        """return vector corresponding given word
        """
        word_vectors = self.model.wv
        return word_vectors[word]

    def __getitem__(self, item):
        # Indexing the wrapper is shorthand for vec().
        return self.vec(item)

    def save(self, path) -> None:
        """save model to path
        """
        self.model.save(path)

    @classmethod
    def load(cls, path):
        """load model from path
        """
        restored = FastText.load(path)
        # A default instance is created first, then its model is swapped for
        # the one read from disk.
        instance = cls()
        instance.model = restored
        return instance